# Install every package the analysis needs in a single call.
# Fixes vs original: "scales" was installed twice; 35 separate
# install.packages() calls are collapsed into one (it accepts a character
# vector); only missing packages are installed so re-runs are cheap; and the
# packages are actually attached with library() — the original never loaded
# them, so later calls such as glimpse(), plot_ly() and gg_miss_var() would
# fail in a fresh session.
required_pkgs <- c(
  "Metrics", "tidymodels", "ggplot2", "dplyr", "naniar", "tidyverse",
  "kableExtra", "lubridate", "readxl", "highcharter", "scales",
  "RColorBrewer", "wesanderson", "plotly", "shiny", "readr",
  "choroplethr", "choroplethrMaps", "GGally", "zoo", "ggmap", "stringr",
  "gridExtra", "caret", "treemap", "psych", "DAAG", "leaps", "corrplot",
  "glmnet", "boot", "ggpubr", "gplots", "nortest"
)
new_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(new_pkgs) > 0) {
  install.packages(new_pkgs)
}
# Attach the packages whose functions are called directly below.
library(tidymodels)   # linear_reg, initial_split, training, testing
library(tidyverse)    # ggplot2, dplyr, readr, stringr, ...
library(naniar)       # gg_miss_var, vis_miss
library(plotly)       # plot_ly, ggplotly, subplot
library(ggpubr)       # ggqqplot, ggline, ggscatter
library(gplots)       # balloonplot
library(corrplot)     # corrplot
library(caret)        # createDataPartition
# Load the HR dataset from the working directory.
# read.csv() already returns a data.frame, so the original extra
# hr <- data.frame(hr) round-trip was redundant and has been removed.
hr <- read.csv("HR_comma_sep.csv")
head(hr)  # quick look at the first six rows
## satisfaction_level last_evaluation number_project average_montly_hours
## 1 0.38 0.53 2 157
## 2 0.80 0.86 5 262
## 3 0.11 0.88 7 272
## 4 0.72 0.87 5 223
## 5 0.37 0.52 2 159
## 6 0.41 0.50 2 153
## time_spend_company Work_accident left promotion_last_5years Department salary
## 1 3 0 1 0 sales low
## 2 6 0 1 0 sales medium
## 3 4 0 1 0 sales medium
## 4 5 0 1 0 sales low
## 5 3 0 1 0 sales low
## 6 3 0 1 0 sales low
dim(hr)       # dataset dimensions: rows x columns
## [1] 14999 10
colnames(hr)  # variable names in the dataset
## [1] "satisfaction_level" "last_evaluation" "number_project"
## [4] "average_montly_hours" "time_spend_company" "Work_accident"
## [7] "left" "promotion_last_5years" "Department"
## [10] "salary"
As we can see, several of these columns contain categorical data. When the data is first loaded, such columns are read in as the character data type; in order to work with these variables, their data type needs to be converted to factor.
# Convert every character column to a factor so the categorical variables
# (Department, salary) can be used in grouped summaries and models.
# unclass() drops the data.frame class, letting as.data.frame() re-apply
# stringsAsFactors = TRUE to each character column.
hr <- as.data.frame(unclass(hr), stringsAsFactors = TRUE)
str(hr)  # confirm the new column types
## 'data.frame': 14999 obs. of 10 variables:
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Department : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ salary : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...
Now we check the number of null values and variables that consist of null values in the dataset.
sum(is.na(hr))  # total number of missing values across the whole dataset
## [1] 0
summary(hr)  # five-number summaries for numeric columns, level counts for factors
## satisfaction_level last_evaluation number_project average_montly_hours
## Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0
## Median :0.6400 Median :0.7200 Median :4.000 Median :200.0
## Mean :0.6128 Mean :0.7161 Mean :3.803 Mean :201.1
## 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
##
## time_spend_company Work_accident left promotion_last_5years
## Min. : 2.000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 3.000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 3.498 Mean :0.1446 Mean :0.2381 Mean :0.02127
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :10.000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## Department salary
## sales :4140 high :1237
## technical :2720 low :7316
## support :2229 medium:6446
## IT :1227
## product_mng: 902
## marketing : 858
## (Other) :2923
glimpse(hr)  # dplyr: compact transposed preview of columns and types
## Rows: 14,999
## Columns: 10
## $ satisfaction_level <dbl> 0.38, 0.80, 0.11, 0.72, 0.37, 0.41, 0.10, 0.92, ~
## $ last_evaluation <dbl> 0.53, 0.86, 0.88, 0.87, 0.52, 0.50, 0.77, 0.85, ~
## $ number_project <int> 2, 5, 7, 5, 2, 2, 6, 5, 5, 2, 2, 6, 4, 2, 2, 2, ~
## $ average_montly_hours <int> 157, 262, 272, 223, 159, 153, 247, 259, 224, 142~
## $ time_spend_company <int> 3, 6, 4, 5, 3, 3, 4, 5, 5, 3, 3, 4, 5, 3, 3, 3, ~
## $ Work_accident <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ left <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ promotion_last_5years <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ Department <fct> sales, sales, sales, sales, sales, sales, sales,~
## $ salary <fct> low, medium, medium, low, low, low, low, low, lo~
Here we can graphically visualize the null values in each attribute
gg_miss_var(hr)  # naniar: bar chart of missing-value counts per variable
A heat plot that clearly shows which features contain null values, along with the overall percentages of missing and present values.
# naniar heat map of missingness; x-axis labels rotated for readability
vis_miss(hr) + theme(axis.text.x = element_text(angle = 90))
This pie chart is used to find the types of departments in a company along with their percentages.
# Department frequency table for the pie chart.
# as.data.frame(table(...)) already yields exactly the columns Var1 and
# Freq, so the original re-selection of those two columns was a no-op and
# has been removed.
hr_dept <- as.data.frame(table(hr$Department))
fig <- plot_ly(hr_dept, labels = ~Var1, values = ~Freq, type = 'pie')
fig
# Share of each salary band within every department.
# count() plus a grouped mutate replaces the original two group_by /
# summarize passes and the merge(): inside the Department group,
# sum(Freq) is the department head-count, so Freq / sum(Freq) is the
# within-department ratio. (The original filter(salary %in%
# c("high","medium","low")) was a no-op, since those are the only levels
# of salary.)
salary_ratio <- hr %>%
  count(Department, salary, name = "Freq") %>%
  group_by(Department) %>%
  mutate(ratio = Freq / sum(Freq)) %>%
  ungroup()
# Plot: one dodged bar per salary band, y axis formatted as percent
ggplot(salary_ratio, aes(x = Department, y = ratio, fill = salary)) +
  geom_bar(position = "dodge", stat = "identity") +
  xlab("Department") + ylab("Salary") +
  scale_fill_discrete(name = "Salary Categories") +
  scale_y_continuous(labels = scales::percent) +
  coord_flip()
# Mean satisfaction level per department as a horizontal bar chart.
# Fixes vs original: geom_col() has no `stat` argument (identity is its
# implicit stat), which triggered "Ignoring unknown parameters: stat";
# round()'s argument is `digits`, not the partially-matched `digit`; the
# fill aesthetic and labs() calls that were overridden by the fixed fill
# and by xlab()/ylab() are dropped.
hr %>%
  group_by(Department) %>%
  summarise(mean_satisfaction_level = mean(satisfaction_level, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(Department, mean_satisfaction_level),
             y = mean_satisfaction_level)) +
  geom_col(color = "black", fill = "#ba3a9e") +
  coord_flip() +
  theme_gray() +
  geom_text(aes(label = round(mean_satisfaction_level, digits = 4)),
            hjust = 2.0, color = "white", size = 3.5) +
  ggtitle("Satisfaction Level for each Department",
          subtitle = "Satisfaction Level vs Department") +
  xlab("Department") +
  ylab("Satisfaction Level") +
  theme(legend.position = "none",
        plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 1),
        plot.subtitle = element_text(color = "black", hjust = 0.5),
        axis.ticks = element_blank())
# Mean satisfaction level per salary band.
# Fixes vs original: geom_col() has no `stat` argument, which produced the
# "Ignoring unknown parameters: stat" warning; round() takes `digits`; the
# two filters were no-ops (salary has no NA values and no "Unknown" level)
# and are removed, as are the aesthetics overridden later in the chain.
hr %>%
  group_by(salary) %>%
  summarise(mean_satisfaction_level = mean(satisfaction_level, na.rm = TRUE)) %>%
  ggplot(aes(x = reorder(salary, mean_satisfaction_level),
             y = mean_satisfaction_level)) +
  geom_col(color = "black", fill = "#b9d2fa") +
  coord_flip() +
  theme_gray() +
  geom_text(aes(label = round(mean_satisfaction_level, digits = 4)),
            hjust = 2.0, color = "black", size = 3.5) +
  ggtitle("Mean Satisfaction Level comparison with Salary",
          subtitle = "Satisfaction Level vs Salary") +
  xlab("Salary") +
  ylab("Satisfaction Level") +
  theme(legend.position = "none",
        plot.title = element_text(color = "black", size = 14, face = "bold", hjust = 0.5),
        plot.subtitle = element_text(color = "black", hjust = 0.5),
        axis.ticks = element_blank())
## Correlation Matrix of numeric dimensions. A correlation plot is made to find relationships among the features.
# Correlation matrix over the numeric columns only. Selecting with the
# where(is.numeric) predicate picks exactly the same eight columns, in the
# same order, as the original explicit column list.
hr.corr <- hr %>%
  select(where(is.numeric))
cor(hr.corr) # get the correlation matrix
## satisfaction_level last_evaluation number_project
## satisfaction_level 1.00000000 0.105021214 -0.142969586
## last_evaluation 0.10502121 1.000000000 0.349332589
## number_project -0.14296959 0.349332589 1.000000000
## average_montly_hours -0.02004811 0.339741800 0.417210634
## time_spend_company -0.10086607 0.131590722 0.196785891
## Work_accident 0.05869724 -0.007104289 -0.004740548
## left -0.38837498 0.006567120 0.023787185
## promotion_last_5years 0.02560519 -0.008683768 -0.006063958
## average_montly_hours time_spend_company Work_accident
## satisfaction_level -0.020048113 -0.100866073 0.058697241
## last_evaluation 0.339741800 0.131590722 -0.007104289
## number_project 0.417210634 0.196785891 -0.004740548
## average_montly_hours 1.000000000 0.127754910 -0.010142888
## time_spend_company 0.127754910 1.000000000 0.002120418
## Work_accident -0.010142888 0.002120418 1.000000000
## left 0.071287179 0.144822175 -0.154621634
## promotion_last_5years -0.003544414 0.067432925 0.039245435
## left promotion_last_5years
## satisfaction_level -0.38837498 0.025605186
## last_evaluation 0.00656712 -0.008683768
## number_project 0.02378719 -0.006063958
## average_montly_hours 0.07128718 -0.003544414
## time_spend_company 0.14482217 0.067432925
## Work_accident -0.15462163 0.039245435
## left 1.00000000 -0.061788107
## promotion_last_5years -0.06178811 1.000000000
# Visualize the correlation matrix; circle size and colour encode strength and sign
corrplot(cor(hr.corr), method= "circle", bg = "black") # put this in a nice table
Before performing any technique we need to perform some preliminary tests.
Null Hypothesis: Data is normally distributed Alternative Hypothesis: Data is not normally distributed
# 2x2 grid of histograms for the main numeric variables.
# Fix: the original reused xlab = 'satisfaction_level' on all four plots;
# each histogram now labels its own variable. The needless paste() around
# single-string titles is also dropped.
par(mfrow = c(2,2))
hist(hr$satisfaction_level, freq = TRUE, main = "Histogram of Satisfaction Level", xlab = 'satisfaction_level', col = '#32a88d')
hist(hr$last_evaluation, freq = TRUE, main = "Histogram of Last Evaluation", xlab = 'last_evaluation', col = '#3298a8')
hist(hr$average_montly_hours, freq = TRUE, main = "Histogram of Average Monthly Hours", xlab = 'average_montly_hours', col = '#326da8')
hist(hr$number_project, freq = TRUE, main = "Histogram of Number of Projects", xlab = 'number_project', col = '#5732a8')
## Boxplot
# 2x2 grid of boxplots for the same four variables.
# Fix: corrected the misspelled axis label 'satifaction_level'.
par(mfrow = c(2,2))
boxplot(hr$satisfaction_level, xlab = 'satisfaction_level', col = '#32a88d')
boxplot(hr$last_evaluation, xlab = 'last_evaluation', col = '#3298a8')
boxplot(hr$average_montly_hours, xlab = 'average_montly_hours', col = '#326da8')
boxplot(hr$number_project, xlab = 'number_project', col = '#5732a8')
## QQ-Plot
# Base-R normal Q-Q plot of satisfaction_level with a reference line
qqnorm(hr$satisfaction_level)
qqline(hr$satisfaction_level, col = 'blue' )
## GGQQPLOT
# ggpubr equivalent, which adds a confidence band
ggqqplot(hr$satisfaction_level)
From the above plots one can conclude that the data for satisfaction level is right skewed, which means the distribution is not normal. Hence, we will now apply a non-parametric test, i.e. the Mann-Whitney U-test, instead of a t-test (parametric test).
summary(hr$satisfaction_level)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0900 0.4400 0.6400 0.6128 0.8200 1.0000
# Anderson-Darling normality test (handles n > 5000, unlike Shapiro-Wilk)
nortest::ad.test(hr$satisfaction_level)
##
## Anderson-Darling normality test
##
## data: hr$satisfaction_level
## A = 168.37, p-value < 2.2e-16
# Shapiro-Wilk accepts at most 5000 observations, so the test is applied to
# three consecutive slices that together cover all 14999 rows.
# Fix: R indexing is 1-based; the original 0:5000 silently dropped the 0
# index while asking for 5001 values. 1:5000 requests a clean 5000-row slice.
shapiro.test(hr$satisfaction_level[1:5000])
##
## Shapiro-Wilk normality test
##
## data: hr$satisfaction_level[1:5000]
## W = 0.94543, p-value < 2.2e-16
shapiro.test(hr$satisfaction_level[5001:10000])
##
## Shapiro-Wilk normality test
##
## data: hr$satisfaction_level[5001:10000]
## W = 0.9563, p-value < 2.2e-16
shapiro.test(hr$satisfaction_level[10001:14999])
##
## Shapiro-Wilk normality test
##
## data: hr$satisfaction_level[10001:14999]
## W = 0.95056, p-value < 2.2e-16
##
## Shapiro-Wilk normality test
##
## data: hr$satisfaction_level[10001:14999]
## W = 0.95056, p-value < 2.2e-16
As the p-value is much smaller than the threshold value of 0.05, we can reject the null hypothesis and accept the alternative hypothesis: the data is not normally distributed.
Here we applied the Shapiro-Wilk test three times because it is limited to at most 5000 observations, so the dataset was split into three slices. Moreover, the Anderson-Darling test was also applied to cross-check the normality of the data.
In this test, selected dimensions are satisfaction_level as metric variable and left as nominal variable.
Null Hypothesis: Left has no influence on satisfaction_level Alternative Hypothesis: Left has influence on satisfaction_level
# Mann-Whitney U-test (Wilcoxon rank-sum): does satisfaction_level differ
# between leavers (left = 1) and stayers (left = 0)?
wilcox.test(hr$satisfaction_level~hr$left)
##
## Wilcoxon rank sum test with continuity correction
##
## data: hr$satisfaction_level by hr$left
## W = 30522915, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
A Mann-Whitney U-test showed that `left` has a significant influence (W = 30522915, p-value < 2.2e-16) on satisfaction level.
# Box plot of satisfaction level split by the `left` indicator, built as a
# single pipeline instead of reassigning `fig` twice.
fig <- plot_ly(y = hr$satisfaction_level, x = hr$left, type = "box") %>%
  layout(title = "Satisfaction_Level vs Left")
fig
In this test, we use categorical variables in order to find some relationship among the variables. Here, department and promotion_last_5years are used for testing
# Contingency table: Department x promotion in the last 5 years
chisq_table <- table(hr$Department, hr$promotion_last_5years)
# gplots balloon plot: each cell's dot area is proportional to its count
balloonplot(t(chisq_table), main = "Data Analysis for Chi Squared Test", label=FALSE, show.margins = FALSE, xlab = "Promotion Last 5 Years", ylab = "Department")
Implementing Chi-sq test using R formula
# Pearson chi-squared test of independence: Department vs promotion
chisq <- chisq.test(chisq_table)
chisq
##
## Pearson's Chi-squared test
##
## data: chisq_table
## X-squared = 350.91, df = 9, p-value < 2.2e-16
Observed Frequencies
chisq$observed  # observed cell counts from the contingency table
##
## 0 1
## accounting 753 14
## hr 724 15
## IT 1224 3
## management 561 69
## marketing 815 43
## product_mng 902 0
## RandD 760 27
## sales 4040 100
## support 2209 20
## technical 2692 28
Expected Frequencies
# Expected counts under the independence hypothesis, rounded for display
round(chisq$expected, 0)
##
## 0 1
## accounting 751 16
## hr 723 16
## IT 1201 26
## management 617 13
## marketing 840 18
## product_mng 883 19
## RandD 770 17
## sales 4052 88
## support 2182 47
## technical 2662 58
Residuals
# Pearson residuals: (observed - expected) / sqrt(expected); large absolute
# values mark the cells driving the significant result
round(chisq$residuals, 3)
##
## 0 1
## accounting 0.084 -0.573
## hr 0.027 -0.181
## IT 0.666 -4.521
## management -2.239 15.190
## marketing -0.854 5.794
## product_mng 0.646 -4.380
## RandD -0.370 2.508
## sales -0.188 1.274
## support 0.587 -3.980
## technical 0.579 -3.924
Correlation Plot of Residuals
# Residual magnitudes via corrplot (is.cor = FALSE: not a correlation matrix)
corrplot(chisq$residuals, is.cor = FALSE)
Now here department and salary are used for testing
# Contingency table: Department x salary band
chisq_table1 <- table(hr$Department, hr$salary)
# Balloon plot of the Department x salary counts
balloonplot(t(chisq_table1), main = "Data Analysis for Chi Squared Test", label=FALSE, show.margins = FALSE, xlab = "Salary", ylab = "Department")
Chi-sq test using R formula
# Pearson chi-squared test of independence: Department vs salary
chisq_1 <- chisq.test(chisq_table1)
chisq_1
##
## Pearson's Chi-squared test
##
## data: chisq_table1
## X-squared = 700.92, df = 18, p-value < 2.2e-16
Observed Values
chisq_1$observed  # observed Department x salary counts
##
## high low medium
## accounting 74 358 335
## hr 45 335 359
## IT 83 609 535
## management 225 180 225
## marketing 80 402 376
## product_mng 68 451 383
## RandD 51 364 372
## sales 269 2099 1772
## support 141 1146 942
## technical 201 1372 1147
Expected Values
# Expected counts under independence, rounded to two decimals
round(chisq_1$expected, 2)
##
## high low medium
## accounting 63.26 374.12 329.63
## hr 60.95 360.46 317.59
## IT 101.19 598.49 527.32
## management 51.96 307.29 270.75
## marketing 70.76 418.50 368.74
## product_mng 74.39 439.96 387.65
## RandD 64.91 383.87 338.22
## sales 341.43 2019.35 1779.21
## support 183.83 1087.23 957.94
## technical 224.32 1326.72 1168.95
Residuals
# Pearson residuals; e.g. the large positive management/high cell drives the result
round(chisq_1$residuals, 3)
##
## high low medium
## accounting 1.351 -0.833 0.296
## hr -2.043 -1.341 2.323
## IT -1.809 0.430 0.335
## management 24.007 -7.262 -2.780
## marketing 1.098 -0.807 0.378
## product_mng -0.741 0.526 -0.236
## RandD -1.726 -1.014 1.837
## sales -3.920 1.772 -0.171
## support -3.159 1.782 -0.515
## technical -1.557 1.243 -0.642
Correlation Plot of Residuals
# Residual magnitudes via corrplot (is.cor = FALSE: not a correlation matrix)
corrplot(chisq_1$residuals, is.cor = FALSE)
Next test is One Factor ANOVA test but As the data is not normally distributed so we will use Kruskal Wallis Test. ## Kruskal Wallis Test In order to implement kruskal wallis test, Department and satisfaction_level are the selected dimensions.
Null Hypothesis: there are no significant differences in satisfaction level among employees of the various departments. Alternative Hypothesis: there is a significant difference in satisfaction level among employees of the various departments.
# Kruskal-Wallis setup: department factor and satisfaction level.
Dept <- hr$Department
SL <- hr$satisfaction_level
KWT_data <- data.frame(Dept, SL)
levels(KWT_data$Dept)
## [1] "accounting" "hr" "IT" "management" "marketing"
## [6] "product_mng" "RandD" "sales" "support" "technical"
KWT_data$Dept <- ordered(KWT_data$Dept, levels = c("accounting","hr","IT","management","marketing","product_mng","RandD","sales","support","technical"))
# Per-department summary statistics.
# Fix: inside a grouped summarise(), columns must be referenced bare (SL).
# The original used KWT_data$SL, which bypasses the grouping and repeats
# the OVERALL mean/sd/median/IQR (0.613 / 0.249 / 0.64 / 0.38) on every
# row — exactly what the identical rows in the original output show.
group_by(KWT_data, Dept) %>%
  summarise(count = n(), mean = mean(SL), sd = sd(SL), median = median(SL), IQR = IQR(SL))
## # A tibble: 10 x 6
## Dept count mean sd median IQR
## <ord> <int> <dbl> <dbl> <dbl> <dbl>
## 1 accounting 767 0.613 0.249 0.64 0.38
## 2 hr 739 0.613 0.249 0.64 0.38
## 3 IT 1227 0.613 0.249 0.64 0.38
## 4 management 630 0.613 0.249 0.64 0.38
## 5 marketing 858 0.613 0.249 0.64 0.38
## 6 product_mng 902 0.613 0.249 0.64 0.38
## 7 RandD 787 0.613 0.249 0.64 0.38
## 8 sales 4140 0.613 0.249 0.64 0.38
## 9 support 2229 0.613 0.249 0.64 0.38
## 10 technical 2720 0.613 0.249 0.64 0.38
# Box plot of satisfaction level per department.
# Fix: the original passed ggplot2::diamonds as the data argument while
# mapping columns from KWT_data, and gave `color` a bare vector instead of
# a formula; plot from KWT_data directly.
fig <- plot_ly(KWT_data, y = ~SL, x = ~Dept, type = "box", color = ~Dept)
fig
# ggpubr line plot: mean satisfaction level per department with standard-error
# bars and jittered raw points, departments shown in a fixed order
ggline(KWT_data, x = "Dept", y = "SL",
add = c("mean_se", "jitter"),
order = c("accounting","hr","IT","management","marketing","product_mng","RandD","sales","support","technical"),
ylab = "Satisfaction Level", xlab = "Departments") + theme(axis.text.x = element_text(angle = 45, hjust = 0.4, vjust = 0.5))
# Kruskal-Wallis rank sum test: non-parametric analogue of one-factor ANOVA
kruskal.test(KWT_data$SL ~ KWT_data$Dept)
##
## Kruskal-Wallis rank sum test
##
## data: KWT_data$SL by KWT_data$Dept
## Kruskal-Wallis chi-squared = 18.296, df = 9, p-value = 0.03189
The result depicts that p-value is less than significance level, so we can conclude that there are significant differences of satisfaction level among employees of different departments but as we can also see that there isn’t a big difference within calculated p-value and significance level, in that case we can say as well that there is a small difference in satisfaction level among employees of different departments but not the massive difference.
For correlation, two variables are taken under consideration: “number_project” and “time_spend_company”. Furthermore, we have to determine whether to apply a non-parametric or a parametric test; for this, the normality of the considered variables will be checked.
# Side-by-side histograms for the two variables entering the correlation.
# Fix: corrected the misspelled title "Porjects" -> "Projects"; dropped the
# needless paste() around single-string titles.
par(mfrow = c(1,2))
hist(hr$number_project, freq = TRUE, main = "Histogram of Number of Projects", xlab = 'number_project', col = '#53b56d')
hist(hr$time_spend_company, freq = TRUE, main = "Histogram of Time_Spend_Company", xlab = 'Time_Spend_Company', col = '#53b56d')
The above graphs depict that the data is not normally distributed; in fact, it is right skewed (long upper tail). For further clarification, we visualize Q-Q plots and perform normality tests as well.
# Normal Q-Q plots for both variables; the pronounced step pattern reflects
# that both are small-range integer counts
qqnorm(hr$number_project)
qqline(hr$number_project, col = 'blue' )
qqnorm(hr$time_spend_company)
qqline(hr$time_spend_company, col = 'blue' )
summary(hr$number_project)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 4.000 3.803 5.000 7.000
# Anderson-Darling normality test (n is too large for Shapiro-Wilk)
nortest::ad.test(hr$number_project)
##
## Anderson-Darling normality test
##
## data: hr$number_project
## A = 444.73, p-value < 2.2e-16
summary(hr$time_spend_company)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 3.000 3.498 4.000 10.000
# Anderson-Darling normality test for time_spend_company
nortest::ad.test(hr$time_spend_company)
##
## Anderson-Darling normality test
##
## data: hr$time_spend_company
## A = 930.99, p-value < 2.2e-16
# Scatter plot with regression line and annotated correlation coefficient.
# Fix: the annotation now uses Spearman's rho, matching the non-parametric
# cor.test() below — the data was just shown to be non-normal, so the
# original Pearson annotation was inconsistent with the test performed.
ggscatter(hr, x = "number_project", y = "time_spend_company",
add = "reg.line", conf.int = TRUE,
cor.coef = TRUE, cor.method = "spearman",
xlab = "Number of Projects", ylab = "Time Spend in Company")
## `geom_smooth()` using formula 'y ~ x'
# Spearman rank correlation (exact = FALSE: ties make exact p-values impossible)
cor.test(hr$number_project, hr$time_spend_company, method = "spearman", exact = FALSE)
##
## Spearman's rank correlation rho
##
## data: hr$number_project and hr$time_spend_company
## S = 4.2068e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.251971
# ---- Linear regression: predict satisfaction_level ----
HR <- hr
# Feature set used for full-data prediction below.
# (The original also built a response frame `y` that was never used; removed.)
x <- HR %>% select(number_project, last_evaluation, time_spend_company, salary, Work_accident)
set.seed(0)
# 75/25 train/test split (rsample)
HR_split <- initial_split(HR, prop = 3/4)
HR_training <- HR_split %>%
  training()
HR_test <- HR_split %>%
  testing()
# Tag every row with its split so the plots below can colour train vs test
train_index <- as.integer(rownames(HR_training))
test_index <- as.integer(rownames(HR_test))
HR[train_index, 'split'] <- 'train'
HR[test_index, 'split'] <- 'test'
# parsnip linear model fitted on the training rows only
lm_model <- linear_reg() %>%
  set_engine('lm') %>%
  set_mode('regression') %>%
  fit(satisfaction_level ~ number_project + time_spend_company + salary + Work_accident + last_evaluation, data = HR_training)
# Predict on ALL rows (train + test) for the visual comparison below
prediction <- lm_model %>%
  predict(x)
colnames(prediction) <- c('prediction')
HR <- cbind(HR, prediction)
# Marginal-histogram scatter plot: predicted vs observed satisfaction level,
# coloured by train/test split, with a histogram of each axis on the margins.
# Top margin: histogram of the observed satisfaction level
hist_top <- ggplot(HR,aes(x=satisfaction_level)) +
geom_histogram(data=subset(HR,split == 'train'),fill = "red", alpha = 0.2, bins = 6) +
geom_histogram(data=subset(HR,split == 'test'),fill = "blue", alpha = 0.2, bins = 6) +
theme(axis.title.y=element_blank(),axis.text.y=element_blank(),axis.ticks.y=element_blank())
hist_top <- ggplotly(p = hist_top)
# Central panel: observed vs predicted with a fitted line per split
scatter <- ggplot(HR, aes(x = satisfaction_level, y = prediction, color = split)) +
geom_point() +
geom_smooth(formula=y ~ x, method=lm, se=FALSE)
scatter <- ggplotly(p = scatter, type = 'scatter')
# Right margin: histogram of the predictions, flipped to run vertically
hist_right <- ggplot(HR,aes(x=prediction)) +
geom_histogram(data=subset(HR,split == 'train'),fill = "red", alpha = 0.2, bins = 13) +
geom_histogram(data=subset(HR,split == 'test'),fill = "blue", alpha = 0.2, bins = 13) +
theme(axis.title.x=element_blank(),axis.text.x=element_blank(),axis.ticks.x=element_blank())+
coord_flip()
hist_right <- ggplotly(p = hist_right)
# Assemble the three panels (plus an empty corner) into a 2x2 grid
s <- subplot(
hist_top,
plotly_empty(),
scatter,
hist_right,
nrows = 2, heights = c(0.2, 0.8), widths = c(0.8, 0.2), margin = 0,
shareX = TRUE, shareY = TRUE, titleX = TRUE, titleY = TRUE
)
## Warning: No trace type specified and no positional attributes specified
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: Can only have one: config
# Hide the legend on the combined figure
layout(s, showlegend = FALSE)
# Refit the same specification with base lm() to get the full summary()
# output (coefficient table, R-squared, F-statistic)
lm_model <- lm(satisfaction_level ~ number_project + time_spend_company + salary + Work_accident + last_evaluation, data = HR_training )
summary(lm_model)
##
## Call:
## lm(formula = satisfaction_level ~ number_project + time_spend_company +
## salary + Work_accident + last_evaluation, data = HR_training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63834 -0.18607 0.01932 0.19455 0.60152
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.644361 0.013383 48.149 < 2e-16 ***
## number_project -0.037271 0.001989 -18.741 < 2e-16 ***
## time_spend_company -0.014319 0.001594 -8.985 < 2e-16 ***
## salarylow -0.039653 0.008516 -4.656 3.26e-06 ***
## salarymedium -0.017722 0.008597 -2.061 0.0393 *
## Work_accident 0.043528 0.006414 6.786 1.21e-11 ***
## last_evaluation 0.252203 0.014177 17.790 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2401 on 11242 degrees of freedom
## Multiple R-squared: 0.05756, Adjusted R-squared: 0.05706
## F-statistic: 114.4 on 6 and 11242 DF, p-value: < 2.2e-16
R squared
summary(lm_model)$r.squared  # proportion of variance explained by the model
## [1] 0.05756131
Adjusted R squared
summary(lm_model)$adj.r.squared  # R-squared penalized for the number of predictors
## [1] 0.05705832
Plot Linear Model
# Standard lm diagnostic plots: residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage
par(mfrow = c(2,2))
plot(lm_model)
# Predictions on the held-out test rows for error measurement
prediction <- predict(object = lm_model, newdata = HR_test)
Testing accuracy of the data using mean absolute error
# Mean absolute error of the linear model on the test set
Metrics::mae(prediction, HR_test$satisfaction_level)
## [1] 0.205784
The Mean Absolute Error is about 0.2. This indicates that the average absolute difference between the observed values and the predicted values is approximately 0.2.
# ---- Classification: predict the `left` indicator ----
LR_HR <- hr
# One-hot encode salary (low / medium / high) as three 0/1 dummies. The
# model below suppresses the intercept (`+ 0`), so keeping all three
# levels does not create a dummy-variable trap.
LR_HR$salary_0 <- ifelse(LR_HR$salary == 'low', 1, 0)
LR_HR$salary_1 <- ifelse(LR_HR$salary == 'medium', 1, 0)
LR_HR$salary_2 <- ifelse(LR_HR$salary == 'high', 1, 0)
# Drop the original salary factor plus Department and Work_accident from
# the feature set
LR_HR <- LR_HR %>%
select(-salary,-Department,-Work_accident)
LR_HR <- data.frame(LR_HR)
colnames(LR_HR)
## [1] "satisfaction_level" "last_evaluation" "number_project"
## [4] "average_montly_hours" "time_spend_company" "left"
## [7] "promotion_last_5years" "salary_0" "salary_1"
## [10] "salary_2"
# 80/20 train/test split, stratified on satisfaction_level (caret)
set.seed(123)
training.samples <- LR_HR$satisfaction_level %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- LR_HR[training.samples, ]
test.data <- LR_HR[-training.samples, ]
# Logistic regression for the binary outcome `left`.
# Fix: glm() defaults to family = gaussian, so the original actually fitted
# a linear probability model even though its output is thresholded as a
# probability below. family = binomial makes it a true logistic regression,
# guaranteeing predictions in [0, 1]. (`+ 0` drops the intercept because
# all three salary dummies are in the feature set.)
# NOTE(review): the coefficient table shown below was produced by the old
# gaussian fit and will change under the binomial family.
model <- glm( left ~.+0,
              family = binomial,
              data = data.frame(train.data))
summary(model)$coef
## Estimate Std. Error t value Pr(>|t|)
## satisfaction_level -0.6587603939 1.446215e-02 -45.550665 0.000000e+00
## last_evaluation 0.0925642591 2.266551e-02 4.083927 4.456870e-05
## number_project -0.0353490138 3.296096e-03 -10.724511 1.032118e-26
## average_montly_hours 0.0006283654 7.903627e-05 7.950343 2.026222e-15
## time_spend_company 0.0359188298 2.450352e-03 14.658643 3.098454e-48
## promotion_last_5years -0.1262921114 2.462190e-02 -5.129259 2.954252e-07
## salary_0 0.5150207116 2.092876e-02 24.608277 1.722938e-130
## salary_1 0.4284465899 2.122426e-02 20.186647 3.852667e-89
## salary_2 0.2979233185 2.388676e-02 12.472320 1.754324e-35
# Predicted probability of leaving for each test row
prob <- model %>% predict(test.data, type="response")
head(prob)
## 5 10 24 25 39 46
## 0.4563813 0.4136868 0.3891538 0.4369158 0.6488003 0.1821503
# Classify with a 0.5 probability threshold
pred <- ifelse(prob > 0.5, 1,0)
head(pred)
## 5 10 24 25 39 46
## 0 0 0 0 1 0
# Test-set accuracy: share of correctly classified rows
mean(pred == test.data$left)
## [1] 0.7732578